import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #importing all necessary Libraries
life_expectancy = pd.read_csv('/content/Life Expectancy Data.csv') #Reading the data set
life_expectancy.head() #gives top 5 rows with atrributes
life_expectancy.dropna(inplace=True) # drop rows with missing values
life_expectancy.dtypes
Country object Year int64 Status object Life expectancy float64 Adult Mortality float64 infant deaths int64 Alcohol float64 percentage expenditure float64 Hepatitis B float64 Measles int64 BMI float64 under-five deaths int64 Polio float64 Total expenditure float64 Diphtheria float64 HIV/AIDS float64 GDP float64 Population float64 thinness 1-19 years float64 thinness 5-9 years float64 Income composition of resources float64 Schooling float64 dtype: object
life_expectancy.describe() #statistical summary of the arributes with numerical values
| Year | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | BMI | under-five deaths | Polio | Total expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness 1-19 years | thinness 5-9 years | Income composition of resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 | 1.649000e+03 | 1649.000000 | 1649.000000 | 1649.000000 | 1649.000000 |
| mean | 2007.840509 | 69.302304 | 168.215282 | 32.553062 | 4.533196 | 698.973558 | 79.217708 | 2224.494239 | 38.128623 | 44.220133 | 83.564585 | 5.955925 | 84.155246 | 1.983869 | 5566.031887 | 1.465363e+07 | 4.850637 | 4.907762 | 0.631551 | 12.119891 |
| std | 4.087711 | 8.796834 | 125.310417 | 120.847190 | 4.029189 | 1759.229336 | 25.604664 | 10085.802019 | 19.754249 | 162.897999 | 22.450557 | 2.299385 | 21.579193 | 6.032360 | 11475.900117 | 7.046039e+07 | 4.599228 | 4.653757 | 0.183089 | 2.795388 |
| min | 2000.000000 | 44.000000 | 1.000000 | 0.000000 | 0.010000 | 0.000000 | 2.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 0.740000 | 2.000000 | 0.100000 | 1.681350 | 3.400000e+01 | 0.100000 | 0.100000 | 0.000000 | 4.200000 |
| 25% | 2005.000000 | 64.400000 | 77.000000 | 1.000000 | 0.810000 | 37.438577 | 74.000000 | 0.000000 | 19.500000 | 1.000000 | 81.000000 | 4.410000 | 82.000000 | 0.100000 | 462.149650 | 1.918970e+05 | 1.600000 | 1.700000 | 0.509000 | 10.300000 |
| 50% | 2008.000000 | 71.700000 | 148.000000 | 3.000000 | 3.790000 | 145.102253 | 89.000000 | 15.000000 | 43.700000 | 4.000000 | 93.000000 | 5.840000 | 92.000000 | 0.100000 | 1592.572182 | 1.419631e+06 | 3.000000 | 3.200000 | 0.673000 | 12.300000 |
| 75% | 2011.000000 | 75.000000 | 227.000000 | 22.000000 | 7.340000 | 509.389994 | 96.000000 | 373.000000 | 55.800000 | 29.000000 | 97.000000 | 7.470000 | 97.000000 | 0.700000 | 4718.512910 | 7.658972e+06 | 7.100000 | 7.100000 | 0.751000 | 14.000000 |
| max | 2015.000000 | 89.000000 | 723.000000 | 1600.000000 | 17.870000 | 18961.348600 | 99.000000 | 131441.000000 | 77.100000 | 2100.000000 | 99.000000 | 14.390000 | 99.000000 | 50.600000 | 119172.741800 | 1.293859e+09 | 27.200000 | 28.200000 | 0.936000 | 20.700000 |
life_expectancy.drop(['Country','Status'],inplace=True,axis=1) #not useful for the linear regression model.
life_expectancy
| Year | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | BMI | under-five deaths | Polio | Total expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness 1-19 years | thinness 5-9 years | Income composition of resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015 | 65.0 | 263.0 | 62 | 0.01 | 71.279624 | 65.0 | 1154 | 19.1 | 83 | 6.0 | 8.16 | 65.0 | 0.1 | 584.259210 | 33736494.0 | 17.2 | 17.3 | 0.479 | 10.1 |
| 1 | 2014 | 59.9 | 271.0 | 64 | 0.01 | 73.523582 | 62.0 | 492 | 18.6 | 86 | 58.0 | 8.18 | 62.0 | 0.1 | 612.696514 | 327582.0 | 17.5 | 17.5 | 0.476 | 10.0 |
| 2 | 2013 | 59.9 | 268.0 | 66 | 0.01 | 73.219243 | 64.0 | 430 | 18.1 | 89 | 62.0 | 8.13 | 64.0 | 0.1 | 631.744976 | 31731688.0 | 17.7 | 17.7 | 0.470 | 9.9 |
| 3 | 2012 | 59.5 | 272.0 | 69 | 0.01 | 78.184215 | 67.0 | 2787 | 17.6 | 93 | 67.0 | 8.52 | 67.0 | 0.1 | 669.959000 | 3696958.0 | 17.9 | 18.0 | 0.463 | 9.8 |
| 4 | 2011 | 59.2 | 275.0 | 71 | 0.01 | 7.097109 | 68.0 | 3013 | 17.2 | 97 | 68.0 | 7.87 | 68.0 | 0.1 | 63.537231 | 2978599.0 | 18.2 | 18.2 | 0.454 | 9.5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2933 | 2004 | 44.3 | 723.0 | 27 | 4.36 | 0.000000 | 68.0 | 31 | 27.1 | 42 | 67.0 | 7.13 | 65.0 | 33.6 | 454.366654 | 12777511.0 | 9.4 | 9.4 | 0.407 | 9.2 |
| 2934 | 2003 | 44.5 | 715.0 | 26 | 4.06 | 0.000000 | 7.0 | 998 | 26.7 | 41 | 7.0 | 6.52 | 68.0 | 36.7 | 453.351155 | 12633897.0 | 9.8 | 9.9 | 0.418 | 9.5 |
| 2935 | 2002 | 44.8 | 73.0 | 25 | 4.43 | 0.000000 | 73.0 | 304 | 26.3 | 40 | 73.0 | 6.53 | 71.0 | 39.8 | 57.348340 | 125525.0 | 1.2 | 1.3 | 0.427 | 10.0 |
| 2936 | 2001 | 45.3 | 686.0 | 25 | 1.72 | 0.000000 | 76.0 | 529 | 25.9 | 39 | 76.0 | 6.16 | 75.0 | 42.1 | 548.587312 | 12366165.0 | 1.6 | 1.7 | 0.427 | 9.8 |
| 2937 | 2000 | 46.0 | 665.0 | 24 | 1.68 | 0.000000 | 79.0 | 1483 | 25.5 | 39 | 78.0 | 7.10 | 78.0 | 43.5 | 547.358878 | 12222251.0 | 11.0 | 11.2 | 0.434 | 9.8 |
1649 rows × 20 columns
def make_corr_heatmap(df): #function that plots correlation heatmap
corr = df.corr()
fig, ax = plt.subplots(figsize=(12,9))
sns.heatmap(corr, annot=True)
ax.set_xticklabels(ax.get_xticklabels())
plt.show()
make_corr_heatmap(life_expectancy)
sns.pairplot(life_expectancy,diag_kind='kde') #displays pair plot with kernel density
<seaborn.axisgrid.PairGrid at 0x7e0c2d4d65f0>
#year,infant_deaths,measles,population,total_expenditure
print(life_expectancy.columns)
life_expectancy.drop(['Year','infant deaths','Measles ','Total expenditure','Population'],inplace=True,axis=1) #less correlation with life expectancy: redundant attritubes
#life_expectancy.isnull().count()
Index(['Year', 'Life expectancy ', 'Adult Mortality', 'infant deaths',
'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ',
' HIV/AIDS', 'GDP', 'Population', ' thinness 1-19 years',
' thinness 5-9 years', 'Income composition of resources', 'Schooling'],
dtype='object')
X=life_expectancy.drop('Life expectancy ',axis=1)
Y=life_expectancy['Life expectancy ']
print(X,Y)
#split the dataset into training and testing sets to then train a linear regression model using scikit-learn.
Adult Mortality Alcohol percentage expenditure Hepatitis B BMI \
0 263.0 0.01 71.279624 65.0 19.1
1 271.0 0.01 73.523582 62.0 18.6
2 268.0 0.01 73.219243 64.0 18.1
3 272.0 0.01 78.184215 67.0 17.6
4 275.0 0.01 7.097109 68.0 17.2
... ... ... ... ... ...
2933 723.0 4.36 0.000000 68.0 27.1
2934 715.0 4.06 0.000000 7.0 26.7
2935 73.0 4.43 0.000000 73.0 26.3
2936 686.0 1.72 0.000000 76.0 25.9
2937 665.0 1.68 0.000000 79.0 25.5
under-five deaths Polio Diphtheria HIV/AIDS GDP \
0 83 6.0 65.0 0.1 584.259210
1 86 58.0 62.0 0.1 612.696514
2 89 62.0 64.0 0.1 631.744976
3 93 67.0 67.0 0.1 669.959000
4 97 68.0 68.0 0.1 63.537231
... ... ... ... ... ...
2933 42 67.0 65.0 33.6 454.366654
2934 41 7.0 68.0 36.7 453.351155
2935 40 73.0 71.0 39.8 57.348340
2936 39 76.0 75.0 42.1 548.587312
2937 39 78.0 78.0 43.5 547.358878
thinness 1-19 years thinness 5-9 years \
0 17.2 17.3
1 17.5 17.5
2 17.7 17.7
3 17.9 18.0
4 18.2 18.2
... ... ...
2933 9.4 9.4
2934 9.8 9.9
2935 1.2 1.3
2936 1.6 1.7
2937 11.0 11.2
Income composition of resources Schooling
0 0.479 10.1
1 0.476 10.0
2 0.470 9.9
3 0.463 9.8
4 0.454 9.5
... ... ...
2933 0.407 9.2
2934 0.418 9.5
2935 0.427 10.0
2936 0.427 9.8
2937 0.434 9.8
[1649 rows x 14 columns] 0 65.0
1 59.9
2 59.9
3 59.5
4 59.2
...
2933 44.3
2934 44.5
2935 44.8
2936 45.3
2937 46.0
Name: Life expectancy , Length: 1649, dtype: float64
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X_train,X_test,Y_train,Y_test = train_test_split(X.values,Y.values,test_size=0.2) #dataset is split in the 4:1 ratio for training and testing
model=LinearRegression()
model.fit(X_train,Y_train) #(training) fiting the data to the dataset
print(model.score(X_train,Y_train)) #r2 score
print(model.score(X_test,Y_test)) #print the R2 score for both the training and testing sets to evaluate the model's performance.
0.8289517081052937 0.812440597177055
life_expectancy_coeff = pd.DataFrame(model.coef_,X.columns,columns=['Coefficient'])
life_expectancy_coeff #Understanding the coefficients helps to interpret the impact of each feature on the predicted outcome.
| Coefficient | |
|---|---|
| Adult Mortality | -0.018759 |
| Alcohol | -0.139051 |
| percentage expenditure | 0.000403 |
| Hepatitis B | -0.003710 |
| BMI | 0.033776 |
| under-five deaths | -0.001319 |
| Polio | 0.008982 |
| Diphtheria | 0.016367 |
| HIV/AIDS | -0.414523 |
| GDP | 0.000018 |
| thinness 1-19 years | 0.035633 |
| thinness 5-9 years | -0.068861 |
| Income composition of resources | 10.633458 |
| Schooling | 0.960030 |
model.intercept_
52.25524864107931
predictions = model.predict(X_test) #make predictions on the test set
plt.scatter(Y_test,predictions) # visualize the predicted values against the actual values using a scatter plot
<matplotlib.collections.PathCollection at 0x7e0c1bba6920>
coefficients = pd.DataFrame(data=model.coef_.reshape(-1,1),index=X.columns,columns=["coefficients"]) #create a DataFrame to show the coefficients for each feature
print(coefficients)
print("Intercept:", model.intercept_) #print the intercept
print("Function: \n", "y = " + " +\n".join([f"({val:.2f})*({X.columns[ind]})" for ind,val in enumerate(model.coef_)]) + f" +\n({model.intercept_})") #print the linear regression equation
coefficients Adult Mortality -0.018759 Alcohol -0.139051 percentage expenditure 0.000403 Hepatitis B -0.003710 BMI 0.033776 under-five deaths -0.001319 Polio 0.008982 Diphtheria 0.016367 HIV/AIDS -0.414523 GDP 0.000018 thinness 1-19 years 0.035633 thinness 5-9 years -0.068861 Income composition of resources 10.633458 Schooling 0.960030 Intercept: 52.25524864107931 Function: y = (-0.02)*(Adult Mortality) + (-0.14)*(Alcohol) + (0.00)*(percentage expenditure) + (-0.00)*(Hepatitis B) + (0.03)*( BMI ) + (-0.00)*(under-five deaths ) + (0.01)*(Polio) + (0.02)*(Diphtheria ) + (-0.41)*( HIV/AIDS) + (0.00)*(GDP) + (0.04)*( thinness 1-19 years) + (-0.07)*( thinness 5-9 years) + (10.63)*(Income composition of resources) + (0.96)*(Schooling) + (52.25524864107931)
# MAE MSE RMSE
from sklearn import metrics
print('MAE1:', metrics.mean_absolute_error(Y_test, predictions))
print('MSE1:', metrics.mean_squared_error(Y_test, predictions))
print('RMSE1:', np.sqrt(metrics.mean_squared_error(Y_test, predictions)))
MAE1: 3.0756719082362913 MSE1: 16.037107579841344 RMSE1: 4.004635761194936
import statsmodels.api as sm
from scipy import stats
X_ = sm.add_constant(X_train)
est = sm.OLS(Y_train, X_)
est2 = est.fit()
print(est2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.829
Model: OLS Adj. R-squared: 0.827
Method: Least Squares F-statistic: 451.4
Date: Sun, 21 Jan 2024 Prob (F-statistic): 0.00
Time: 18:12:45 Log-Likelihood: -3556.8
No. Observations: 1319 AIC: 7144.
Df Residuals: 1304 BIC: 7221.
Df Model: 14
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 52.2552 0.785 66.602 0.000 50.716 53.794
x1 -0.0188 0.001 -17.405 0.000 -0.021 -0.017
x2 -0.1391 0.034 -4.145 0.000 -0.205 -0.073
x3 0.0004 0.000 1.985 0.047 4.65e-06 0.001
x4 -0.0037 0.005 -0.726 0.468 -0.014 0.006
x5 0.0338 0.007 5.051 0.000 0.021 0.047
x6 -0.0013 0.001 -1.883 0.060 -0.003 5.5e-05
x7 0.0090 0.006 1.525 0.127 -0.003 0.021
x8 0.0164 0.007 2.444 0.015 0.003 0.030
x9 -0.4145 0.021 -20.020 0.000 -0.455 -0.374
x10 1.827e-05 3.22e-05 0.568 0.570 -4.49e-05 8.14e-05
x11 0.0356 0.058 0.609 0.542 -0.079 0.150
x12 -0.0689 0.058 -1.197 0.232 -0.182 0.044
x13 10.6335 0.957 11.113 0.000 8.756 12.511
x14 0.9600 0.067 14.318 0.000 0.828 1.092
==============================================================================
Omnibus: 40.658 Durbin-Watson: 2.006
Prob(Omnibus): 0.000 Jarque-Bera (JB): 66.118
Skew: -0.263 Prob(JB): 4.39e-15
Kurtosis: 3.963 Cond. No. 1.22e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.22e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
col_mask = (est2.pvalues[1:])<0.05
X.columns[col_mask]
Index(['Adult Mortality', 'Alcohol', 'percentage expenditure', ' BMI ',
'Diphtheria ', ' HIV/AIDS', 'Income composition of resources',
'Schooling'],
dtype='object')
col_mask2 = np.logical_not(col_mask)
X.columns[col_mask2]
Index(['Hepatitis B', 'under-five deaths ', 'Polio', 'GDP',
' thinness 1-19 years', ' thinness 5-9 years'],
dtype='object')
#life_expectancy.drop(['Polio','Hepatitis B','GDP',' thinness 1-19 years','under-five deaths ',' thinness 5-9 years'],axis=1,inplace=True)
life_expectancy.head(1)
| Life expectancy | Adult Mortality | Alcohol | percentage expenditure | BMI | Diphtheria | HIV/AIDS | Income composition of resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 65.0 | 263.0 | 0.01 | 71.279624 | 19.1 | 65.0 | 0.1 | 0.479 | 10.1 |
X2=life_expectancy.drop('Life expectancy ',axis=1)
Y2=life_expectancy['Life expectancy ']
print(X2,Y2)
Adult Mortality Alcohol percentage expenditure BMI Diphtheria \
0 263.0 0.01 71.279624 19.1 65.0
1 271.0 0.01 73.523582 18.6 62.0
2 268.0 0.01 73.219243 18.1 64.0
3 272.0 0.01 78.184215 17.6 67.0
4 275.0 0.01 7.097109 17.2 68.0
... ... ... ... ... ...
2933 723.0 4.36 0.000000 27.1 65.0
2934 715.0 4.06 0.000000 26.7 68.0
2935 73.0 4.43 0.000000 26.3 71.0
2936 686.0 1.72 0.000000 25.9 75.0
2937 665.0 1.68 0.000000 25.5 78.0
HIV/AIDS Income composition of resources Schooling
0 0.1 0.479 10.1
1 0.1 0.476 10.0
2 0.1 0.470 9.9
3 0.1 0.463 9.8
4 0.1 0.454 9.5
... ... ... ...
2933 33.6 0.407 9.2
2934 36.7 0.418 9.5
2935 39.8 0.427 10.0
2936 42.1 0.427 9.8
2937 43.5 0.434 9.8
[1649 rows x 8 columns] 0 65.0
1 59.9
2 59.9
3 59.5
4 59.2
...
2933 44.3
2934 44.5
2935 44.8
2936 45.3
2937 46.0
Name: Life expectancy , Length: 1649, dtype: float64
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
X2_train,X2_test,Y2_train,Y2_test = train_test_split(X2.values,Y2.values,test_size=0.2)
model2=LinearRegression()
model2.fit(X2_train,Y2_train)
print(model.score(X2_train,Y2_train)) #r2 score
print(model.score(X2_test,Y2_test))
0.8243791898208145 0.8209650654574213
life_expectancy_coeff2 = pd.DataFrame(model2.coef_,X2.columns,columns=['Coefficient'])
life_expectancy_coeff2
| Coefficient | |
|---|---|
| Adult Mortality | -0.018239 |
| Alcohol | -0.094395 |
| percentage expenditure | 0.000447 |
| BMI | 0.040110 |
| Diphtheria | 0.020884 |
| HIV/AIDS | -0.476124 |
| Income composition of resources | 9.661914 |
| Schooling | 1.012723 |
model2.intercept_
51.68852172103817
predictions2 = model2.predict(X2_test)
plt.scatter(Y2_test,predictions2)
<matplotlib.collections.PathCollection at 0x7e0c15a03af0>
coefficients2 = pd.DataFrame(data=model2.coef_.reshape(-1,1),index=X2.columns,columns=["coefficients"]) #create a DataFrame to show the coefficients for each feature
print(coefficients2)
print("Intercept:", model2.intercept_) #print the intercept
print("Function: \n", "y = " + " +\n".join([f"({val:.2f})*({X2.columns[ind]})" for ind,val in enumerate(model2.coef_)]) + f" +\n({model2.intercept_})")
coefficients Adult Mortality -0.018239 Alcohol -0.094395 percentage expenditure 0.000447 BMI 0.040110 Diphtheria 0.020884 HIV/AIDS -0.476124 Income composition of resources 9.661914 Schooling 1.012723 Intercept: 51.68852172103817 Function: y = (-0.02)*(Adult Mortality) + (-0.09)*(Alcohol) + (0.00)*(percentage expenditure) + (0.04)*( BMI ) + (0.02)*(Diphtheria ) + (-0.48)*( HIV/AIDS) + (9.66)*(Income composition of resources) + (1.01)*(Schooling) + (51.68852172103817)
X_2 = sm.add_constant(X2_train)
est2_2 = sm.OLS(Y2_train, X_2)
est22 = est2_2.fit()
print(est22.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.826
Model: OLS Adj. R-squared: 0.825
Method: Least Squares F-statistic: 776.7
Date: Sun, 21 Jan 2024 Prob (F-statistic): 0.00
Time: 18:17:55 Log-Likelihood: -3585.5
No. Observations: 1319 AIC: 7189.
Df Residuals: 1310 BIC: 7236.
Df Model: 8
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 51.6885 0.687 75.234 0.000 50.341 53.036
x1 -0.0182 0.001 -16.895 0.000 -0.020 -0.016
x2 -0.0944 0.034 -2.808 0.005 -0.160 -0.028
x3 0.0004 6.46e-05 6.916 0.000 0.000 0.001
x4 0.0401 0.006 6.519 0.000 0.028 0.052
x5 0.0209 0.005 4.305 0.000 0.011 0.030
x6 -0.4761 0.023 -21.136 0.000 -0.520 -0.432
x7 9.6619 0.916 10.550 0.000 7.865 11.459
x8 1.0127 0.066 15.373 0.000 0.883 1.142
==============================================================================
Omnibus: 37.110 Durbin-Watson: 2.029
Prob(Omnibus): 0.000 Jarque-Bera (JB): 62.228
Skew: -0.230 Prob(JB): 3.07e-14
Kurtosis: 3.960 Cond. No. 1.76e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.76e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
col_mask2 = (est22.pvalues[1:])<0.05
print(col_mask2)
[ True True True True True True True True]
print('MAE2:', metrics.mean_absolute_error(Y2_test, predictions2))
print('MSE2:', metrics.mean_squared_error(Y2_test, predictions2))
print('RMSE2:', np.sqrt(metrics.mean_squared_error(Y2_test, predictions2)))
MAE1: 2.8738818423360657 MSE1: 14.660168731913682 RMSE1: 3.828859978102318
D={
'Model name':['Model 1','Model 2'],
'Independent Variables': ['Adult Mortality, Alcohol, percentage expenditure, Hepatitis B, BMI, under-five deaths, Polio, Diphtheria, HIV/AIDS, GDP, thinness 1-19 years, thinness 5-9 years, Income composition of resources, Schooling','Adult Mortality, Alcohol, percentage expenditure, BMI, Diphtheria, HIV/AIDS, Income composition of resources, Schooling'],
'MAE':['3.0756719082362913','2.8738818423360657'],
'MSE':['16.037107579841344','14.660168731913682'],
'RMSE':['4.004635761194936','3.828859978102318'],
'R_SQ':['0.812440597177055','0.8209650654574213']
}
DF = pd.DataFrame(data=D)
DF
| Model name | Independent Variables | MAE | MSE | RMSE | R_SQ | |
|---|---|---|---|---|---|---|
| 0 | Model 1 | Adult Mortality, Alcohol, percentage expenditu... | 3.0756719082362913 | 16.037107579841344 | 4.004635761194936 | 0.812440597177055 |
| 1 | Model 2 | Adult Mortality, Alcohol, percentage expenditu... | 2.8738818423360657 | 14.660168731913682 | 3.828859978102318 | 0.8209650654574213 |
Vrushali Kadam - 22BLC1300
Objective:
The objective of this analysis is to build and evaluate a linear regression model for predicting life expectancy based on a given dataset. This involves comprehensive data preprocessing, exploratory data analysis (EDA), and the application of a linear regression model with subsequent evaluation.
Data Loading and Exploration:
We initiated the analysis by loading the life expectancy dataset using Pandas:
import pandas as pd
life_expectancy = pd.read_csv('/content/Life Expectancy Data.csv') life_expectancy.dropna(inplace=True)
Subsequently, we inspected the dataset by examining data types, and providing a statistical summary:
life_expectancy.dtypes life_expectancy.describe()
Data Preprocessing:
To streamline the dataset for linear regression modeling, we removed irrelevant columns ('Country' and 'Status'):
life_expectancy.drop(['Country', 'Status'], inplace=True, axis=1)
Exploratory Data Analysis (EDA):
EDA was performed to better understand relationships within the data. We created a correlation heatmap and a pair plot:
import matplotlib.pyplot as plt import seaborn as sns
def make_corr_heatmap(df): corr = df.corr() fig, ax = plt.subplots(figsize=(12, 9)) sns.heatmap(corr, annot=True) ax.set_xticklabels(ax.get_xticklabels()) plt.show()
make_corr_heatmap(life_expectancy) sns.pairplot(life_expectancy, diag_kind='kde')
Methodology:
Linear Regression Modeling:
The dataset was split into training and testing sets, and a linear regression model was trained using scikit-learn:
from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression
X = life_expectancy.drop('Life expectancy ', axis=1) Y = life_expectancy['Life expectancy ']
X_train, X_test, Y_train, Y_test = train_test_split(X.values, Y.values, test_size=0.2) model = LinearRegression() model.fit(X_train, Y_train)
Model Evaluation and Coefficients:
The model's performance was evaluated using R2 score,MAE,MSE,RMSE. Additionally, we examined the coefficients and intercept of the linear regression model:
For Model 1:
print(model.score(X_train, Y_train)) print(model.score(X_test, Y_test))
life_expectancycoeff = pd.DataFrame(model.coef, X.columns, columns=['Coefficient']) print(life_expectancy_coeff)
from sklearn import metrics print('MAE1:', metrics.mean_absolute_error(Y_test, predictions)) print('MSE1:', metrics.mean_squared_error(Y_test, predictions)) print('RMSE1:', np.sqrt(metrics.mean_squared_error(Y_test, predictions)))
Similarly is done for the Model 2 as well
Results:
We limited the number of models to two due to the significant p-values obtained for all variables in the second model. Model 2 emerges as the optimal choice among various models, providing insights into the impact of relevant independent variables on life expectancy. It serves as the most accurate representation of the data, offering the optimal equation for Multiple Linear Regression.
#RESULTS
DF
| Model name | Independent Variables | MAE | MSE | RMSE | R_SQ | |
|---|---|---|---|---|---|---|
| 0 | Model 1 | Adult Mortality, Alcohol, percentage expenditu... | 3.0756719082362913 | 16.037107579841344 | 4.004635761194936 | 0.812440597177055 |
| 1 | Model 2 | Adult Mortality, Alcohol, percentage expenditu... | 2.8738818423360657 | 14.660168731913682 | 3.828859978102318 | 0.8209650654574213 |
%%shell
jupyter nbconvert --to html /content/MultipleRegression.ipynb
[NbConvertApp] Converting notebook /content/MultipleRegression.ipynb to html [NbConvertApp] Writing 8036868 bytes to /content/MultipleRegression.html